mightandmagicfandomcom-20200222-history
User talk:Imasolmyr/Heroes III Index
#!/usr/bin/env python3
"""Generate MediaWiki index pages for a category tree of the wiki.

The script asks the MediaWiki API for every non-redirect page and every
non-empty category, looks up the categories each of them belongs to, and then
writes a nested list of links (one heading per category, one link per page)
to ``output/AllPages.mediawiki`` and to one file per top-level category.
"""
import pathlib
import urllib.request
import urllib.parse
import json
import sys

if not pathlib.Path('output').exists():
    pathlib.Path('output').mkdir()

# Combined index of every category visited; closed at the end of main().
# UTF-8 is requested explicitly because page titles may be non-ASCII.
all_pages_output_file = open('output/AllPages.mediawiki', 'w', encoding='utf-8')
# File name stem -> open file for one top-level category; filled by main().
category_output_files = {}
base_api_url = 'http://mightandmagic.wikia.com/'
base_url = 'http://mightandmagic.wikia.com/wiki/'
# Category title -> set of page titles that are members of it.
page_titles_by_category = {}
# Category title -> set of category titles that are members of it.
category_titles_by_category = {}


def _fetch_json(url, context):
    """Return the decoded JSON body of *url*.

    If the request cannot be opened, print *context* (what was being asked
    for) and the error, then exit with a non-zero status.
    """
    try:
        response = urllib.request.urlopen(url)
    except Exception as e:
        print('\n\n\n\n')
        print(context)
        print(e)
        print('\n\n\n\n')
        sys.exit(1)
    with response:
        return json.loads(response.read().decode())


def _list_all(list_name, query_url, continue_key):
    """Yield every entry of a paginated ``list=`` query.

    *query_url* must end with the ``...from=`` parameter; the continuation
    title is appended to it. Paging stops when the response carries no
    ``query-continue`` element.
    """
    continue_from = ''
    while True:
        results = _fetch_json(
            query_url + urllib.parse.quote(continue_from.replace(' ', '_')),
            continue_from)
        yield from results['query'][list_name]
        if 'query-continue' not in results:
            return
        continue_from = results['query-continue'][list_name][continue_key]


def _batch_titles(titles, prefix='', min_length=50):
    """Group *titles* into ``|``-separated strings for a ``titles=`` query.

    A batch is closed as soon as its length (counting one separator per
    title) exceeds *min_length*. The final, shorter batch is kept as well,
    so no title is dropped.
    """
    batches = []
    current = []
    current_length = 0
    for title in titles:
        entry = prefix + title
        current.append(entry)
        current_length += len(entry) + 1
        if current_length > min_length:
            batches.append('|'.join(current))
            current = []
            current_length = 0
    if current:
        batches.append('|'.join(current))
    return batches


def _collect_categories(title_batches, members_by_category):
    """Record, for each queried title, the categories it belongs to.

    *members_by_category* is updated in place: category title -> set of
    member titles. One progress line is printed per categorized title.
    """
    prop_query = (base_api_url
                  + 'api.php?action=query&prop=categories&format=json&titles=')
    for batch in title_batches:
        results = _fetch_json(
            prop_query + urllib.parse.quote(batch.replace(' ', '_')), batch)
        for result in results['query']['pages'].values():
            if 'categories' not in result:
                continue
            result_title = result['title']
            names = []
            for category in result['categories']:
                category_title = category['title']
                members_by_category.setdefault(
                    category_title, set()).add(result_title)
                names.append(category_title)
            print(result_title + ' : ' + ', '.join(names))


def main(root_category='Category:Heroes III'):
    """Crawl the wiki and write the index files.

    root_category: category whose direct sub-categories each get their own
        output file. NOTE(review): the default was reconstructed from a
        garbled source line - confirm it against the wiki.
    """
    try:
        page_query = (base_api_url
                      + 'api.php?action=query&list=allpages&aplimit=500'
                      + '&apfilterredir=nonredirects&format=json&apfrom=')
        page_titles = [
            page['title']
            for page in _list_all('allpages', page_query, 'apfrom')
            if page['title'] != 'Main Page'
        ]
        _collect_categories(_batch_titles(page_titles),
                            page_titles_by_category)

        category_query = (base_api_url
                          + 'api.php?action=query&list=allcategories'
                          + '&aclimit=500&acprop=size&format=json&acfrom=')
        category_titles = [
            category['*']
            for category in _list_all('allcategories', category_query, 'acfrom')
            # Skip empty categories; the size is compared by value because it
            # may arrive as a number or as a string.
            if category['size'] not in (0, '0')
        ]
        _collect_categories(_batch_titles(category_titles, prefix='Category:'),
                            category_titles_by_category)

        print('\n\n')
        print(page_titles_by_category)
        print('\n\n')
        print(category_titles_by_category)

        for category_title in sorted(
                category_titles_by_category.get(root_category, ())):
            # Only letters are kept so the title is safe as a file name.
            category_file_name = ''.join(
                ch for ch in category_title if ch.isalpha())
            category_output_files[category_file_name] = open(
                'output/' + category_file_name + '.mediawiki', 'w',
                encoding='utf-8')
            print_titles(category_file_name, category_title)
    finally:
        # Runs on normal completion and on sys.exit(), so buffered output is
        # always flushed to disk.
        for output_file in category_output_files.values():
            output_file.close()
        all_pages_output_file.close()


def print_titles(category_file_name, category_title, category_level='=',
                 indent_level='', _ancestors=()):
    """Write one category heading, its pages, and its sub-categories.

    Output goes both to the combined index file and to the file registered
    under *category_file_name* in ``category_output_files``.

    category_file_name: key into ``category_output_files``.
    category_title: full category title, including the ``Category:`` prefix.
    category_level: run of ``=`` that delimits the heading; one more ``=`` is
        added per nesting level.
    indent_level: string placed in front of each link label.
    _ancestors: internal; titles of the categories currently being expanded,
        used to stop on cyclic category membership.
    """
    print(category_title)
    heading = (category_level + ' [' + base_url
               + category_title.replace(' ', '_') + ' ' + indent_level
               + category_title.split('Category:', 1).pop() + '] '
               + category_level + '\n\n')
    all_pages_output_file.write(heading)
    category_output_files[category_file_name].write(heading)

    category_level += '='
    # NOTE(review): the width of this indent string may have been collapsed
    # when the script was pasted - confirm the intended value.
    indent_level += ' '

    for page_title in sorted(page_titles_by_category.get(category_title, ())):
        link = ('[' + base_url + page_title.replace(' ', '_') + ' '
                + indent_level + page_title + ']\n\n')
        all_pages_output_file.write(link)
        category_output_files[category_file_name].write(link)

    lineage = _ancestors + (category_title,)
    for child_category_title in sorted(
            category_titles_by_category.get(category_title, ())):
        if child_category_title in lineage:
            # A category that contains one of its own ancestors would
            # otherwise recurse without end.
            continue
        print_titles(category_file_name, child_category_title,
                     category_level, indent_level, lineage)


if __name__ == "__main__":
    main()